/*** MATH TEST SCORES FOR SPRING 2010-2019 ***/
/*
This script reads individual-level math test score data. It summarizes the
score distributions, assigns information from score schedules, and then
processes into unique records by student-year. Finally, checks that scores
are valid according to score schedules.
*/

set more 1
clear all

do "paths.do"

capture log close
log using "$LOGFILES\tracking_1.log",replace

/*** 

TAKS was administered 2003-2011 in grades 3-11
STAAR was administered 2012+ in grades 3-8 with end-of-course tests in high school

Variables used in this do file:
 Invalid_id1_flag: binary indicator, =1 if id1 is invalid (means will not match across years)
 Invalid_id2_flag : relevant to merging with data outside TEA (does not predict 
  whether id2 will match across year for the same id1 or not)
 M_SCODE: math score code
  Blank=field is empty for some specific subject files (e.g., social studies and science)
  *=No information available for this subject
  A=Absent
  D=No information available for this subject
  G=TAKS alternate record
  L=Student is LEP-exempt, do not score
  M=Medical Exception
  N=No Authentic Academic Response
  O=Other (e.g., illness during testing, testing irregularity)
  P=Previously met standard
  R=ARD committee has determined that TAKS-M is not the appropriate assessment
      for SSI retest opportunities, the student meets participation requirement for TAKS-Alt
  S=Score
  X=Student is ARD exempt, do not score
 M_SSC: math scale score
 campus: 9-digit campus number 
 grade: 03-14
 year: administration year
 month: record in numbers (i.e., 4=April)
***/

**********************************
***Read in score schedule data ***
**********************************

** Read in score schedules from files we compiled from TEA raw score conversion PDFs
cd "$WORKING"
import delimited "$WORKING\tracking_0-schedules.csv", clear
rename test_ver M_TESTVER
rename scale_score M_SSC
keep year grade M_TESTVER M_SSC
save "$WORKING\tracking_1-schedules.dta", replace

** Read in score range data
import delimited "$WORKING\tracking_0-regRanges.csv", clear
rename grade Mgrade
keep year Mgrade scale_score_min scale_score_max
save "$WORKING\tracking_1-ranges.dta", replace


*************************************************
**** Combine scores across grades and years *****
*************************************************

foreach i in 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
	cd "$TESTS\\`i'"
	* creates a local with all of the TAKS and STAAR test score files present in that year
	local files : dir . files "*.dta"
	local k=0

	foreach file in `files' {

		disp "`file'"
		use "$TESTS\\`i'\\`file'", clear
		capture gen M_SSC = .
		capture gen M_TESTVER = ""
		capture gen M_SCODE = ""
		
		* determine whether file is a math test score file or not
		gen tmp = (M_SCODE == "")
		summ tmp
		local m=r(mean)
		assert (`m' < 0.02) | (`m' > 0.98)
		
		capture rename STUDENT_ID1 ID1
		capture rename STUDENT_INVALID_ID1_FLAG INVALID_ID1_FLAG
		capture rename STUDENT_ID2 ID2
		capture rename STUDENT_INVALID_ID2_FLAG INVALID_ID2_FLAG
		capture rename STUDENT_IN_PEIMS_SCHOOL_YEAR IN_PEIMS_SCHOOL_YEAR
		
		keep ID1 ID2 IN_PEIMS_SCHOOL_YEAR INVALID_ID2_FLAG INVALID_ID1_FLAG M_SCODE M_SSC M_TESTVER year month district campus grade
		tostring IN_PEIMS_SCHOOL_YEAR, replace
		destring M_SSC, replace
		
		* append to base dataset, if has any observations and is not the first run
		if (`m' < 0.02) {
			if (_N > 0) {
				gen fiscyr = `i'
				gen file_name = "`file'"
				if ((`k'!=0) | (`i' != 2010)) {
					append using "$WORKING/temp/tracking_1a"
				}
				qui save "$WORKING/temp/tracking_1a", replace
				local k = `k' + 1
			}

		}
		
		* "math" scores (with M_SCODE == "S") in non-math files are outside regular score ranges
		if ((`m' > 0.98) & (`m' < 1)) {
			tab M_SCODE, m
			summ M_SSC if (M_SCODE == "S"), detail
		}
	}
}


************************************************
**** Get actual score distributions by file ****
************************************************

use "$WORKING/temp/tracking_1a.dta", clear
di "$S_DATE $S_TIME: tracking_1a saved"

gen one0 = 1
collapse (sum) obs=one0, by(grade year file_name M_TESTVER M_SSC)

by grade year file_name M_TESTVER: gen ind = _n
egen tmp1 = group(grade year file_name M_TESTVER), missing
egen tmp2 = min(tmp1), by(grade year)
gen test_ind = 1 + tmp1 - tmp2
drop tmp1 tmp2

reshape wide file_name M_TESTVER M_SSC obs, i(grade year ind) j(test_ind)
export delimited using "$WORKING\tracking_1-distByFile.csv", replace

************************************
**** Clean sample and variables ****
************************************

use "$WORKING/temp/tracking_1a.dta", clear

** convert numeric variables
for var ID1 ID2 IN_PEIMS_SCHOOL_YEAR INVALID_ID1_FLAG INVALID_ID2_FLAG: rename X, lower
destring year month district campus grade in_peims_school_year invalid_id1_flag invalid_id2_flag, replace
* check that precision of campus is preserved
sort campus
format campus %10.0f
list campus in l

** keep only Spring (Jan-May) administrations
replace year=2000+year
tab year fiscyr, m
tab month year, m
keep if month>=1 & month<=5
* Note: as expected, now year and fiscyr match
tab year fiscyr, m
assert year==fiscyr
tab grade, m
drop fiscyr

** records not matching students enrolled in PEIMS
* if SSN is missing or invalid, these records cannot be matched to any other datasets
* Note: if in_peims_school_year is missing, we treat it as if it is equal to one
* (i.e., we only treat 0s as confirmed cases where the student is not enrolled)
* DROP these cases
tab year in_peims_school_year, m
tab in_peims_school_year invalid_id2_flag, m
gen tmp1=(in_peims_school_year==0 & (invalid_id2_flag==1|id2==""))
tabstat tmp1, by(year)
drop if tmp1==1

* set id1 to missing for remaining non-enrolled records
* regardless of whether id1 exists or not, it should not be used for matching
* either id1 will be missing or invalid_id1_flag will be 1
gen tmp2=(id1=="")
tab invalid_id1_flag tmp2 if in_peims_school_year==0, m
replace id1="" if in_peims_school_year==0
replace invalid_id1_flag=. if in_peims_school_year==0
* DROP if duplicate id2 for alternative record that matches enrolled student
egen tmp3=max(in_peims_school_year) if id2!="", by(id2 year)
drop if tmp3==1 & in_peims_school_year==0
drop tmp*

** reconciling student IDs and ID flags
* identify and drop conflicting mappings (if any)
* note: without "missing" specified, tag set to 0 if variable list has any missings
* so, tmp0=0 for all cases where id1=. or id2=.
egen tmp0=tag(id1 id2 year)
egen tmp1=total(tmp0) if id2!="", by(id2 year)
egen tmp2=total(tmp0) if id1!="", by(id1 year)
tab tmp1 tmp2, m cell
drop if (tmp1>1 & tmp1!=.)|(tmp2>1 & tmp2!=.)
* potentially fill in missing ID entries
egen tmp3=mode(id1) if id2!="", by(id2 year)
egen tmp4=mode(id2) if id1!="", by(id1 year)
replace id1=tmp3 if id1==""
replace id2=tmp4 if id2==""
* identify and drop if both IDs are missing
gen tmp5=id1==""
gen tmp6=id2==""
tab tmp5 tmp6, m cell
drop if tmp5==1 & tmp6==1
* check that flags are constant
tab invalid_id1_flag invalid_id2_flag, m cell
egen tmp7=max(invalid_id1_flag) if id1!="", by(id1 year)
egen tmp8=max(invalid_id2_flag) if id2!="", by(id2 year)
egen tmp9=max(in_peims_school_year), by(id1 id2 year)
tab invalid_id1_flag tmp7, m
tab invalid_id2_flag tmp8, m
tab in_peims_school_year tmp9, m
replace invalid_id1_flag=tmp7
replace invalid_id2_flag=tmp8
replace in_peims_school_year=tmp9
drop tmp*

save "$WORKING/temp/tracking_1b.dta", replace
di "$S_DATE $S_TIME: tracking_1b saved"

** flag scores that cannot be traced to official score schedules
use "$WORKING/temp/tracking_1b.dta", clear
merge m:1 year grade M_TESTVER M_SSC using "$WORKING\tracking_1-schedules.dta"
drop if _merge == 2
* flag for scores not appearing on an official score schedule
gen Msched = _merge == 1
* note: we only gathered information about STAAR score schedules for 2012+
tabstat Msched, by(year)
drop _merge

** set scores to missing for versions with modified or alternate standards
tab year M_TESTVER, m
tab M_SCODE M_TESTVER, m
replace M_SSC = . if M_TESTVER == "M"
replace M_SSC = . if M_TESTVER == "T"
replace Msched = . if M_TESTVER == "M"
replace Msched = . if M_TESTVER == "T"
replace Msched = . if M_SCODE != "S"
gen Mcdastan = (M_SCODE == "S") & ((M_TESTVER == "T") | (M_TESTVER == "M"))

** create variable with valid scaled scores and flags for non-scored test documents
tab M_SCODE year, m
tab year grade if M_SCODE==""
drop if M_SCODE==""
gen Mscore=M_SSC if M_SCODE == "S"
summ Mscore, detail
gen Mcdabs=(M_SCODE=="A")
gen Mcdother=(M_SCODE=="O")
gen Mcdlep=(M_SCODE=="L")
gen Mcdsped=(M_SCODE=="G"|M_SCODE=="M"|M_SCODE=="N"|M_SCODE=="R"|M_SCODE=="X")
gen Mcdpass=(M_SCODE=="P")
gen Mcdninfd = (M_SCODE=="D")
gen Mcdninfs = (M_SCODE=="*")
tab year grade if Mcdninfd==1
tab year grade if Mcdninfs==1

** clarify iff relationship between missing score and M-codes
egen tmp1=rsum(Mcd*)
gen tmp2 = (Mscore == .)
assert tmp1 == tmp2
drop M_SSC M_SCODE M_TESTVER tmp1 tmp2

* show the distribution of untraced scores
tab year grade if (Msched == 1) & (grade <= 8) & (Mscore != .)

save "$WORKING/temp/tracking_1c.dta", replace
di "$S_DATE $S_TIME: tracking_1c saved"

******************************************************
**** Keep only one observation per student / year ****
******************************************************

use "$WORKING/temp/tracking_1c.dta", clear

* keep the student X year record with a non-missing score, then earliest
* month, then lowest grade, then highest score. make sure that these variables
* uniquely identify obsevations.

* observe how many duplicates there are
sort id1 id2 year ///
		Mcdabs Mcdother Mcdlep Mcdsped Mcdpass Mcdninfd Mcdninfs Mcdastan ///
		month grade Mscore campus
by id1 id2 year ///
		Mcdabs Mcdother Mcdlep Mcdsped Mcdpass Mcdninfd Mcdninfs Mcdastan ///
		month grade Mscore campus: gen tmp1 = _n
tab year tmp1
drop tmp1

* drop duplicates
keep id1 id2 in_peims_school_year invalid_id1_flag invalid_id2_flag ///
		year month district campus grade ///
		Mscore Mcd* Msched
duplicates drop

* confirm that observations are unique
by id1 id2 year ///
		Mcdabs Mcdother Mcdlep Mcdsped Mcdpass Mcdninfd Mcdninfs Mcdastan ///
		month grade Mscore campus: gen tmp1 = _n
tab year tmp1
assert (tmp1 == 1)
drop tmp1

** show duplicates at each layer of sorting
by id1 id2 year Mcdabs Mcdother Mcdlep Mcdsped Mcdpass Mcdninfd Mcdninfs Mcdastan month grade Mscore: gen tmp1 = _n
by id1 id2 year Mcdabs Mcdother Mcdlep Mcdsped Mcdpass Mcdninfd Mcdninfs Mcdastan month grade: gen tmp2 = _n if (tmp1 == 1)
by id1 id2 year Mcdabs Mcdother Mcdlep Mcdsped Mcdpass Mcdninfd Mcdninfs Mcdastan month: gen tmp3 = _n if (tmp2 == 1)
by id1 id2 year Mcdabs Mcdother Mcdlep Mcdsped Mcdpass Mcdninfd Mcdninfs Mcdastan: gen tmp4 = _n if (tmp3 == 1)
by id1 id2 year: gen tmp5 = _n if (tmp4 == 1)
* different campuses across same scores
tab year tmp1
* different scores across same grades
tab year tmp2
* different grades across same months
tab year tmp3
* different months across same M-codes
tab year tmp4
* different M-codes across same students
tab year tmp5
drop tmp*


** sort in order of priority and in proper direction
gsort +id1 +id2 +year ///
		+Mcdabs +Mcdother +Mcdlep +Mcdsped +Mcdpass +Mcdninfd +Mcdninfs +Mcdastan ///
		+month +grade -Mscore +campus
by id1 id2 year: gen tmp1 = _n
tab year tmp1
keep if (tmp1 == 1)
drop tmp1
tab year grade

rename in_peims_school_year in_peims_flag
rename month Mmonth
rename grade Mgrade
rename district Mdist
rename campus Mcamp

la var Mdist "district from test files"
la var Mcamp "campus from test files"
la var year "fiscal year"
la var in_peims_flag "enrolled in PEIMS"
la var invalid_id1_flag "id1 invalid"
la var invalid_id2_flag "id2 invalid"
la var Mmonth "mth of test record"
la var Mgrade "grade of test record"
la var Mscore "math scale score"
la var Mcdabs "absent for test"
la var Mcdother "other - O"
la var Mcdlep "LEP exempt for test"
la var Mcdsped "spec ed exempt for test"
la var Mcdpass "prior pass exemption"
la var Mcdninfd "no information - D"
la var Mcdninfs "no information - *"
la var Mcdastan "alternative standard"
la var Msched "score not on schedule"


qui compress
keep id1 id2 Mdist Mcamp year *flag M*
order id1 id2 Mdist Mcamp year *flag M*
summ
descr
save "$WORKING/tracking_1.dta", replace


************************************
**** Ancillary tests and data ****
************************************

** create dataset with mean and standard deviation of each test (grade x year)
* we do not create z-scores at this point because retained / advanced students
* should be put on the scale of their current cohort / enrolled grade

use "$WORKING/tracking_1.dta", clear
keep if Mscore != .
collapse (mean) score_mean=Mscore (sd) score_sd=Mscore, by(Mgrade year)
rename Mgrade test_grade
rename year test_year
save "$WORKING\tracking_1-meansd.dta", replace

** create dataset with full distribution of scores
use "$WORKING/tracking_1.dta", clear
keep if (Mscore != .)
gen one0 = 1
collapse (count) nobs=one0, by(year Mgrade Mscore)
rename Mgrade test_grade
rename year test_year
rename Mscore score
save "$WORKING\tracking_1-dist.dta", replace

** check if there are any scores outside official score ranges
use "$WORKING/tracking_1.dta", clear
keep if (year >= 2012) & (Mgrade <= 8)
merge m:1 year Mgrade using "$WORKING\tracking_1-ranges.dta"
assert _merge == 3
drop _merge
gen bad_score = (Mscore != .) & ((Mscore < scale_score_min) | (Mscore > scale_score_max))
assert !bad_score

log close

